Install additional packages

In [1]:
# # install custom packages - for google collab
# !pip install datashader
# !pip install hdbscan

Load Libraries

In [2]:
from platform import python_version

print("python {}".format(python_version()))

import pandas as pd
import numpy as np

print("pandas {}".format(pd.__version__))
print("numpy {}".format(np.__version__))
python 3.6.9
pandas 1.0.5
numpy 1.18.5
In [3]:
import seaborn as sns; sns.set()

from scipy.spatial import ConvexHull, convex_hull_plot_2d
import matplotlib.pyplot as plt
In [4]:
import holoviews as hv
import holoviews.operation.datashader as hd
import datashader as ds
import datashader.transfer_functions as tf

hd.shade.cmap=["lightblue", "darkblue"]
hv.extension('bokeh', 'matplotlib') 
# https://datashader.org/getting_started/Interactivity.html

# https://stackoverflow.com/questions/54793910/how-to-make-the-holoviews-show-graph-on-google-colaboratory-notebook
%env HV_DOC_HTML=true
env: HV_DOC_HTML=true

Data Preparation

Parsing

In [5]:
# set option to process raw data, False will read parsed data directly
DATA_OPTION_PRCESS_RAW = False

# set number of rows to work with
DATA_OPTION_NUM_ROWS = 2307 # total row of data - 2307
#DATA_OPTION_NUM_ROWS = None # all rows

# set paths to data files
RAW_DATA_FILE = 'raw_data/competition_dataset.csv'
PARSED_DATA_FILE = 'intermediate_data/competition_dataset_long_{}.csv'.format(DATA_OPTION_NUM_ROWS)
In [6]:
if DATA_OPTION_PRCESS_RAW:

    # read raw data to process into parsed data
    raw_df = pd.read_csv(RAW_DATA_FILE, header=0, skiprows=0,
                         nrows=DATA_OPTION_NUM_ROWS, delimiter=None)

    parsed_df = raw_df.copy()
    parsed_df['data'] = parsed_df.iloc[:, 0].str.split('; ')
    parsed_df['count'] = parsed_df['data'].str.len()
    parsed_df['count'] = (parsed_df['count'] - 4 - 1) / 6
    parsed_df['count'] = parsed_df['count'].astype(int)

    # credit: https://stackoverflow.com/a/59552714
    spread_ixs = np.repeat(range(len(parsed_df)), parsed_df['count'])
    # .drop(columns='count').reset_index(drop=True)
    parsed_df = parsed_df.iloc[spread_ixs, :]

    parsed_df['track_id'] = parsed_df['data'].str[0].astype(int)
    parsed_df['grouped_row_id'] = parsed_df.groupby(
        'track_id')['track_id'].rank(method='first').astype(int)

    old_col = raw_df.columns.tolist()[0]
    new_cols = old_col.split('; ')

    # build columns
    parsed_df['track_id'] = parsed_df['data'].apply(lambda x: x[0])
    parsed_df['type'] = parsed_df['data'].apply(lambda x: x[1])
    parsed_df['traveled_d'] = parsed_df['data'].apply(lambda x: x[2])
    parsed_df['avg_speed'] = parsed_df['data'].apply(lambda x: x[3])

    parsed_df['lat'] = parsed_df.apply(
        lambda row: row['data'][4+(row['grouped_row_id']-1)*6], axis=1)
    parsed_df['lon'] = parsed_df.apply(
        lambda row: row['data'][5+(row['grouped_row_id']-1)*6], axis=1)
    parsed_df['speed'] = parsed_df.apply(
        lambda row: row['data'][6+(row['grouped_row_id']-1)*6], axis=1)
    parsed_df['lon_acc'] = parsed_df.apply(
        lambda row: row['data'][7+(row['grouped_row_id']-1)*6], axis=1)
    parsed_df['lat_acc'] = parsed_df.apply(
        lambda row: row['data'][8+(row['grouped_row_id']-1)*6], axis=1)
    parsed_df['time'] = parsed_df.apply(
        lambda row: row['data'][9+(row['grouped_row_id']-1)*6], axis=1)

    # clean up columns
    parsed_df = parsed_df.drop(columns=old_col)
    parsed_df = parsed_df.drop(
        columns=['count',
                 'grouped_row_id',
                 'data']
    ).reset_index(drop=True)
    parsed_df = parsed_df.reset_index(drop=False).rename(
        columns={'index': 'record_id'})

    # output to file
    parsed_df.to_csv(PARSED_DATA_FILE, index=False)
    parsed_df.head(5)

else:
    # read parsed data
    parsed_df = pd.read_csv(PARSED_DATA_FILE, header=0,
                            skiprows=0, delimiter=None)
    parsed_df['track_id'] = parsed_df['track_id'].astype(int)

# clean up unnamed index column - perhaps name it as record id?

Compute extra attributes

In [7]:
# calculate orientation
## bearing using acceleration (do not use as it provides inaccurate bearing)
parsed_df['acc_angle'] = np.arctan2(parsed_df['lat_acc'],
                                    parsed_df['lon_acc']) * 180 / np.pi  # lon = x, lat = y

## approximate bearing using acceleration (do not use as it provides inaccurate bearing)
parsed_df['appr_acc_angle'] = parsed_df['acc_angle'].round(-1)

# https://stackoverflow.com/questions/1016039/determine-the-general-orientation-of-a-2d-vector
# https://numpy.org/doc/stable/reference/generated/numpy.arctan2.html
# np.arctan2(y, x) * 180 / np.pi
In [8]:
# compute x and y corrdinates
# this improves the ease of calculating distances, especially for clustering

from datashader.utils import lnglat_to_meters

parsed_df.loc[:, 'x'], parsed_df.loc[:, 'y'] = lnglat_to_meters(parsed_df.lon, parsed_df.lat)
In [9]:
# calculate bearing based on next position

shifted = parsed_df[['track_id', 'x', 'y']].\
    groupby("track_id").\
    shift(-1).\
    rename(columns=lambda x: x+"_lag")
parsed_df = parsed_df.join(shifted)

# https://stackoverflow.com/questions/5058617/bearing-between-two-points


def gb(x1, x2, y1, y2):
    angle = np.arctan2(y1 - y2, x1 - x2) * 180 / np.pi
    #     bearing1 = (angle + 360) % 360
    bearing2 = (90 - angle) % 360

    return(bearing2)


parsed_df['bearing'] = gb(
    x1=parsed_df['x'],
    x2=parsed_df['x_lag'],
    y1=parsed_df['y'],
    y2=parsed_df['y_lag'])
In [10]:
# impute bearing of first points

parsed_df = parsed_df.sort_values(
    by='record_id', axis=0)  # make sure record is in order

shifted = parsed_df[['track_id', 'bearing']].\
    groupby("track_id").\
    shift(1).\
    rename(columns=lambda x: x+"_lead")
parsed_df = parsed_df.join(shifted)

# if bearing is null, take the previous bearing for the track id
parsed_df['bearing'] = np.where(parsed_df['bearing'].isnull(),
                                parsed_df['bearing_lead'], parsed_df['bearing'])
In [11]:
# there should be no more null bearing

parsed_df[parsed_df['bearing'].isnull()]#[['record_id','count']]
Out[11]:
record_id track_id type traveled_d avg_speed lat lon speed lon_acc lat_acc time acc_angle appr_acc_angle x y x_lag y_lag bearing bearing_lead

Data Exploration

In [12]:
parsed_df.head(10)
Out[12]:
record_id track_id type traveled_d avg_speed lat lon speed lon_acc lat_acc time acc_angle appr_acc_angle x y x_lag y_lag bearing bearing_lead
0 0 1 Car 16.7 15.820474 37.991016 23.735396 12.4578 -0.0633 0.0169 0.00 165.051683 170.0 2.642212e+06 4.578157e+06 2.642212e+06 4.578157e+06 270.000000 NaN
1 1 1 Car 16.7 15.820474 37.991016 23.735397 12.4544 0.0154 0.0193 0.04 51.412672 50.0 2.642212e+06 4.578157e+06 2.642213e+06 4.578157e+06 270.000000 270.000000
2 2 1 Car 16.7 15.820474 37.991016 23.735399 12.4601 0.0638 0.0215 0.08 18.623333 20.0 2.642213e+06 4.578157e+06 2.642213e+06 4.578157e+06 270.000000 270.000000
3 3 1 Car 16.7 15.820474 37.991016 23.735400 12.4727 0.1122 0.0238 0.12 11.976132 10.0 2.642213e+06 4.578157e+06 2.642213e+06 4.578157e+06 302.392324 270.000000
4 4 1 Car 16.7 15.820474 37.991015 23.735402 12.4957 0.2060 0.0259 0.16 7.166091 10.0 2.642213e+06 4.578157e+06 2.642213e+06 4.578157e+06 270.000000 302.392324
5 5 1 Car 16.7 15.820474 37.991015 23.735403 12.5353 0.3451 0.0279 0.20 4.622089 0.0 2.642213e+06 4.578157e+06 2.642213e+06 4.578157e+06 270.000000 270.000000
6 6 1 Car 16.7 15.820474 37.991015 23.735405 12.5983 0.5295 0.0298 0.24 3.221180 0.0 2.642213e+06 4.578157e+06 2.642213e+06 4.578156e+06 321.758097 270.000000
7 7 1 Car 16.7 15.820474 37.991014 23.735406 12.6863 0.6928 0.0318 0.28 2.628071 0.0 2.642213e+06 4.578156e+06 2.642214e+06 4.578156e+06 270.000000 321.758097
8 8 1 Car 16.7 15.820474 37.991014 23.735408 12.7946 0.8108 0.0337 0.32 2.380065 0.0 2.642214e+06 4.578156e+06 2.642214e+06 4.578156e+06 270.000000 270.000000
9 9 1 Car 16.7 15.820474 37.991014 23.735409 12.9188 0.9151 0.0357 0.36 2.234097 0.0 2.642214e+06 4.578156e+06 2.642214e+06 4.578156e+06 270.000000 270.000000
In [13]:
len(parsed_df)
Out[13]:
6322772

Variable Plots

In [14]:
# speed vs time - 25 vehicles

dims = (10, 6)
fig, ax = plt.subplots(figsize=dims)
df=parsed_df[(parsed_df['track_id']>100) & (parsed_df['track_id']<105)]\

ax = sns.scatterplot(
    x="time",
    y="speed",
#     hue="track_id",
    marker='x',
    s=0.2,
    data=df)
In [15]:
# lat lon - 25 vehicles

dims = (10, 6)
fig, ax = plt.subplots(figsize=dims)
df = parsed_df[(parsed_df['track_id']>100) & (parsed_df['track_id']<125)]

ax = sns.scatterplot(
    x="lon",
    y="lat",
#     hue="track_id",
    marker='+',
    s=1,
    data=df)
In [16]:
# lat lon - all vehicles

dims = (10, 6)
fig, ax = plt.subplots(figsize=dims)

ax = sns.scatterplot(
    x="lon",
    y="lat",
    #hue="track_id",
    marker='x',
    s=0.2,
    data=parsed_df)
In [17]:
# lat lon - stopped only - speed <1

dims = (10, 6)
fig, ax = plt.subplots(figsize=dims)
df = parsed_df[parsed_df['speed']<1]

ax = sns.scatterplot(
    x="lon",
    y="lat",
    #hue="track_id",
    marker='x',
    s=0.5,
    data=df)
In [18]:
# lat lon - at a certain time frame with low speed

dims = (10, 6)
fig, ax = plt.subplots(figsize=dims)
df = parsed_df[(parsed_df['time'] == 0) & (parsed_df['speed'] < 1)]

ax = sns.scatterplot(
    x="lon",
    y="lat",
    #     hue="type",
    # style="speed",
    marker='x',
    s=20,
    data=df)

Datashader visualizations

In [19]:
# https://datashader.org/user_guide/Geography.html
# https://holoviews.org/reference/elements/bokeh/Tiles.html
## hv.element.tiles.tile_sources

from holoviews.element import tiles
from datashader.utils import lnglat_to_meters

df = parsed_df.copy()
df['track_id'] = df['track_id']
df['type'] = df['type']
df.loc[:, 'x'], df.loc[:, 'y'] = lnglat_to_meters(df.lon,df.lat)
df = df[['x', 'y', 'lon', 'lat', 'track_id', 'time', 'type']]
points = hv.Points(df.copy())

hv.extension('bokeh')
hv.output(backend='bokeh')
#tiles.EsriImagery() * hd.datashade(points).opts(hv.opts(width=750, height=350))
#tiles.StamenTerrainRetina() * hd.datashade(points).opts(hv.opts(width=750, height=350))
#tiles.StamenTerrain() * 
tiles.CartoLight() * hd.datashade(points).opts(hv.opts(width=750, height=350))
Out[19]:
In [20]:
# https://datashader.org/user_guide/Geography.html
# https://holoviews.org/reference/elements/bokeh/Tiles.html
## hv.element.tiles.tile_sources
# https://datashader.org/getting_started/Interactivity.html

from holoviews.element import tiles
from datashader.utils import lnglat_to_meters
from datashader.colors import Sets1to3

df = parsed_df.copy()
df['track_id'] = df['track_id']
df['type'] = df['type']
df.loc[:, 'x'], df.loc[:, 'y'] = lnglat_to_meters(df.lon,df.lat)
df = df[['x', 'y', 'lon', 'lat', 'track_id', 'time', 'type']]
points = hv.Points(df.copy())

hv.extension('bokeh')
hv.output(backend='bokeh')
plot = hd.datashade(points, aggregator=ds.count_cat('type')).opts(hv.opts(width=750, height=350))
#tiles.EsriImagery() * hd.datashade(points).opts(hv.opts(width=750, height=350))
#tiles.StamenTerrainRetina() * hd.datashade(points).opts(hv.opts(width=750, height=350))

color_key = [(name,color) for name,color in zip(['Car', 'Medium Vehicle', 'Motorcycle', 'Heavy Vehicle', 'Bus',
       'Taxi'], Sets1to3)]
color_points = hv.NdOverlay({n: hv.Points(df.iloc[0:1,:], label=str(n)).opts(style=dict(color=c)) for n,c in color_key})

#tiles.StamenTerrain() * 
tiles.CartoLight() * plot * color_points 
Out[20]:
In [21]:
# Car only

# https://datashader.org/user_guide/Geography.html
# https://holoviews.org/reference/elements/bokeh/Tiles.html
## hv.element.tiles.tile_sources

from holoviews.element import tiles
from datashader.utils import lnglat_to_meters

df = parsed_df[parsed_df['type']=='Car'].copy()
df['track_id'] = df['track_id']
df['type'] = df['type']
df.loc[:, 'x'], df.loc[:, 'y'] = lnglat_to_meters(df.lon,df.lat)
df = df[['x', 'y', 'lon', 'lat', 'track_id', 'time', 'type']]
points = hv.Points(df.copy())

hv.extension('bokeh')
hv.output(backend='bokeh')
#tiles.EsriImagery() * hd.datashade(points).opts(hv.opts(width=750, height=350))
#tiles.StamenTerrainRetina() * hd.datashade(points).opts(hv.opts(width=750, height=350))
#tiles.StamenTerrain() * 
tiles.CartoLight() * hd.datashade(points).opts(hv.opts(width=750, height=350))
Out[21]:
In [22]:
# Buses only

# https://datashader.org/user_guide/Geography.html
# https://holoviews.org/reference/elements/bokeh/Tiles.html
## hv.element.tiles.tile_sources

from holoviews.element import tiles
from datashader.utils import lnglat_to_meters

df = parsed_df[parsed_df['type']=='Car'].copy()
df['track_id'] = df['track_id']
df['type'] = df['type']
df.loc[:, 'x'], df.loc[:, 'y'] = lnglat_to_meters(df.lon,df.lat)
df = df[['x', 'y', 'lon', 'lat', 'track_id', 'time', 'type']]
points = hv.Points(df.copy())

hv.extension('bokeh')
hv.output(backend='bokeh')
#tiles.EsriImagery() * hd.datashade(points).opts(hv.opts(width=750, height=350))
#tiles.StamenTerrainRetina() * hd.datashade(points).opts(hv.opts(width=750, height=350))
#tiles.StamenTerrain() * 
tiles.CartoLight() * hd.datashade(points).opts(hv.opts(width=750, height=350))
Out[22]:
In [23]:
# ~ Stationary points only

# https://datashader.org/user_guide/Geography.html
# https://holoviews.org/reference/elements/bokeh/Tiles.html
## hv.element.tiles.tile_sources

from holoviews.element import tiles
from datashader.utils import lnglat_to_meters

df = parsed_df[(parsed_df['speed']==0)].copy()
df['track_id'] = df['track_id']
df['type'] = df['type']
df.loc[:, 'x'], df.loc[:, 'y'] = lnglat_to_meters(df.lon,df.lat)
df = df[['x', 'y', 'lon', 'lat', 'track_id', 'time', 'type']]
points = hv.Points(df.copy())

hv.extension('bokeh')
hv.output(backend='bokeh')
#tiles.EsriImagery() * hd.datashade(points).opts(hv.opts(width=750, height=350))
#tiles.StamenTerrainRetina() * hd.datashade(points).opts(hv.opts(width=750, height=350))
#tiles.StamenTerrain() * 
tiles.CartoLight() * hd.datashade(points).opts(hv.opts(width=750, height=350))
Out[23]:
In [24]:
# ~ moving points only (>0)

# https://datashader.org/user_guide/Geography.html
# https://holoviews.org/reference/elements/bokeh/Tiles.html
## hv.element.tiles.tile_sources

from holoviews.element import tiles
from datashader.utils import lnglat_to_meters

df = parsed_df[(parsed_df['speed']>0)].copy()
df['track_id'] = df['track_id']
df['type'] = df['type']
df.loc[:, 'x'], df.loc[:, 'y'] = lnglat_to_meters(df.lon,df.lat)
df = df[['x', 'y', 'lon', 'lat', 'track_id', 'time', 'type']]
points = hv.Points(df.copy())

hv.extension('bokeh')
hv.output(backend='bokeh')
#tiles.EsriImagery() * hd.datashade(points).opts(hv.opts(width=750, height=350))
#tiles.StamenTerrainRetina() * hd.datashade(points).opts(hv.opts(width=750, height=350))
#tiles.StamenTerrain() * 
tiles.CartoLight() * hd.datashade(points).opts(hv.opts(width=750, height=350))
Out[24]:

Model Development

In [25]:
parsed_df.head(5)
Out[25]:
record_id track_id type traveled_d avg_speed lat lon speed lon_acc lat_acc time acc_angle appr_acc_angle x y x_lag y_lag bearing bearing_lead
0 0 1 Car 16.7 15.820474 37.991016 23.735396 12.4578 -0.0633 0.0169 0.00 165.051683 170.0 2.642212e+06 4.578157e+06 2.642212e+06 4.578157e+06 270.000000 NaN
1 1 1 Car 16.7 15.820474 37.991016 23.735397 12.4544 0.0154 0.0193 0.04 51.412672 50.0 2.642212e+06 4.578157e+06 2.642213e+06 4.578157e+06 270.000000 270.000000
2 2 1 Car 16.7 15.820474 37.991016 23.735399 12.4601 0.0638 0.0215 0.08 18.623333 20.0 2.642213e+06 4.578157e+06 2.642213e+06 4.578157e+06 270.000000 270.000000
3 3 1 Car 16.7 15.820474 37.991016 23.735400 12.4727 0.1122 0.0238 0.12 11.976132 10.0 2.642213e+06 4.578157e+06 2.642213e+06 4.578157e+06 302.392324 270.000000
4 4 1 Car 16.7 15.820474 37.991015 23.735402 12.4957 0.2060 0.0259 0.16 7.166091 10.0 2.642213e+06 4.578157e+06 2.642213e+06 4.578157e+06 270.000000 302.392324
In [26]:
parsed_df.describe()
Out[26]:
record_id track_id traveled_d avg_speed lat lon speed lon_acc lat_acc time acc_angle appr_acc_angle x y x_lag y_lag bearing bearing_lead
count 6.322772e+06 6.322772e+06 6.322772e+06 6.322772e+06 6.322772e+06 6.322772e+06 6.322772e+06 6.322772e+06 6.322772e+06 6.322772e+06 6.322772e+06 6.322772e+06 6.322772e+06 6.322772e+06 6.320465e+06 6.320465e+06 6.322772e+06 6.320465e+06
mean 3.161386e+06 1.069232e+03 4.207995e+02 1.148842e+01 3.799135e+01 2.373267e+01 1.150219e+01 -4.390482e-03 -3.161465e-03 4.235116e+02 1.257496e+00 1.257515e+00 2.641909e+06 4.578205e+06 2.641909e+06 4.578205e+06 1.156028e+02 1.155971e+02
std 1.825227e+06 6.047412e+02 1.837347e+02 7.751485e+00 5.056523e-04 1.454844e-03 1.397968e+01 7.738723e-01 3.236074e-01 2.348440e+02 9.773023e+01 9.796654e+01 1.619525e+02 7.142322e+01 1.619103e+02 7.138357e+01 7.496910e+01 7.495537e+01
min 0.000000e+00 1.000000e+00 1.010000e+00 8.316500e-02 3.798952e+01 2.373028e+01 0.000000e+00 -5.866750e+01 -1.043290e+01 0.000000e+00 -1.800000e+02 -1.800000e+02 2.641642e+06 4.577946e+06 2.641642e+06 4.577946e+06 0.000000e+00 0.000000e+00
25% 1.580693e+06 5.360000e+02 2.889100e+02 6.985240e+00 3.799109e+01 2.373140e+01 0.000000e+00 -1.439000e-01 -1.400000e-02 2.194800e+02 -7.690969e+00 -1.000000e+01 2.641767e+06 4.578167e+06 2.641767e+06 4.578167e+06 9.000000e+01 9.000000e+01
50% 3.161386e+06 1.079000e+03 4.506700e+02 1.002152e+01 3.799139e+01 2.373239e+01 4.491800e+00 0.000000e+00 0.000000e+00 4.381600e+02 0.000000e+00 0.000000e+00 2.641877e+06 4.578210e+06 2.641877e+06 4.578210e+06 9.000000e+01 9.000000e+01
75% 4.742078e+06 1.581000e+03 5.739000e+02 1.453159e+01 3.799161e+01 2.373370e+01 2.094390e+01 9.042500e-02 1.620000e-02 6.278000e+02 1.006080e+01 1.000000e+01 2.642023e+06 4.578240e+06 2.642023e+06 4.578240e+06 1.129262e+02 1.129262e+02
max 6.322771e+06 2.307000e+03 1.291590e+03 6.901287e+01 3.799281e+01 2.373652e+01 9.864200e+01 7.344050e+01 1.664850e+01 8.130000e+02 1.800000e+02 1.800000e+02 2.642337e+06 4.578410e+06 2.642336e+06 4.578410e+06 3.510428e+02 3.510428e+02
In [27]:
# utility - hdbscan clustering

# https://hdbscan.readthedocs.io/en/latest/how_hdbscan_works.html

import hdbscan


def cluster_hdbscan(df,
                    parameters=None,
                    feature_names=['x', 'y'],
                    label_name='unnamed_cluster',
                    verbose=True):

    df = df.copy()

    default_parameters = {
        'metric': 'euclidean',
        'min_cluster_size': 200,
        'min_samples': None,
        'cluster_selection_epsilon': 7
    }

    if(parameters == None):
        parameters = default_parameters
    else:
        default_parameter_names = list(default_parameters.keys())
        parameter_names = list(parameters.keys())

        for parameter in default_parameter_names:
            if(parameter not in parameter_names):
                parameters[parameter] = default_parameters[parameter]

    clusterer = hdbscan.HDBSCAN(
        metric=parameters['metric'],
        min_cluster_size=parameters['min_cluster_size'],
        min_samples=parameters['min_samples'],
        cluster_selection_epsilon=parameters['cluster_selection_epsilon']
    )

    clusterer.fit(df[feature_names])

    df[label_name] = clusterer.labels_

    if verbose:
        print('hdbscan trained on: ' + str(parameters))

    return(df)
In [28]:
# utility - dbscan clustering

from sklearn.cluster import DBSCAN
# https://scikit-learn.org/stable/modules/generated/sklearn.cluster.DBSCAN.html


def cluster_dbscan(df,
                   parameters=None,
                   feature_names=['x', 'y'],
                   label_name='unnamed_cluster',
                   verbose=True):

    df = df.copy()

#     default_parameters = {
#         'metric': 'euclidean',
#         'min_cluster_size': 200,
#         'min_samples': None,
#         'cluster_selection_epsilon': 7
#     }
    clusterer = DBSCAN(
        eps=parameters['cluster_selection_epsilon'],
        min_samples=parameters['min_samples'],
    ).fit(df[feature_names])

    df[label_name] = clusterer.labels_

    if verbose:
        print('dbscan trained on: ' + str(parameters))

    return(df)
In [29]:
# utility - kmeans clustering

# https://hdbscan.readthedocs.io/en/latest/how_hdbscan_works.html

from sklearn.cluster import KMeans


def cluster_kmeans(df,
                   n_clusters=4,
                   feature_names=['bearing_median'],
                   label_name='unnamed_cluster',
                   verbose=True):

    df = df.copy()

    kmeans = KMeans(n_clusters=n_clusters, random_state=0).fit(
        df[feature_names])
    df[label_name] = kmeans.labels_

    if verbose:
        print('kmeans trained on: ' + str(n_clusters) +
              " clusters and " + str(feature_names))

    return(df)

Road Segment Clustering

Clustering roadway segments to identify apporach and major road / intersection.

Prepare segment clustering training data

In [30]:
# prep training data

df = parsed_df  # [(parsed_df['speed']<5)].copy() # ~ bottom 75% speeds
df['record_id'] = df['record_id']
#df['type'] = df['type']
seg_all_df = df[['x', 'y', 'bearing',
                 'record_id']].set_index('record_id')
#seg_all_df = seg_all_df.head(100000)

# rounding is not a good idea
#seg_all_df['x'] = seg_all_df['x'].round(1)
#seg_all_df['y'] = seg_all_df['y'].round(1)

# set count
seg_all_df['count'] = 1

# get count and angle by unique location
seg_all_df = seg_all_df.\
    groupby(['x', 'y']).\
    agg({"count": np.sum, 'bearing': np.median}).\
    reset_index()

# get total and pct of count
seg_all_df['total_count'] = seg_all_df['count'].sum()
seg_all_df['count_pct'] = seg_all_df['count'] / \
    seg_all_df['total_count'] * 100

# save all data for unique points
seg_all_df = seg_all_df.reset_index(
    drop=False).rename(columns={'index': 'u_id'}).set_index('u_id')

### DENSITY REDUCTION ###
# # filter out unique points with fewer than 0.05% of total points
# seg_all_df = seg_all_df[seg_all_df['count_pct'] > 0.05]

# # filter out unique points with fewer than 0.0001% of total points (1 in mil)
# seg_all_df = seg_all_df[seg_all_df['count_pct']>0.0002]

# filter out infreq points (points with less than 10 samples) for training
# this helps reduce data size and introduce breaks in low density areas of the data
seg_train_df = seg_all_df[seg_all_df['count'] > 10]
seg_infre_df = seg_all_df[seg_all_df['count'] <= 10]

# choose features to be trained on - not needed!
# seg_train_df = seg_train_df[['x', 'y', 'count', 'count_pct']]
# seg_train_df = seg_train_df[['x', 'y', 'bearing']]
# seg_train_df = seg_train_df[['x', 'y']]
In [31]:
# full dataset of unique points (all points)
seg_all_df
Out[31]:
x y count bearing total_count count_pct
u_id
0 2.641642e+06 4.578215e+06 1 90.000000 6322772 0.000016
1 2.641643e+06 4.578215e+06 1 90.000000 6322772 0.000016
2 2.641643e+06 4.578215e+06 1 141.758255 6322772 0.000016
3 2.641643e+06 4.578215e+06 1 141.758255 6322772 0.000016
4 2.641643e+06 4.578216e+06 1 90.000000 6322772 0.000016
... ... ... ... ... ... ...
1232848 2.642337e+06 4.578158e+06 1 90.000000 6322772 0.000016
1232849 2.642337e+06 4.578156e+06 1 104.239498 6322772 0.000016
1232850 2.642337e+06 4.578157e+06 1 100.274225 6322772 0.000016
1232851 2.642337e+06 4.578158e+06 1 101.940815 6322772 0.000016
1232852 2.642337e+06 4.578158e+06 1 99.012488 6322772 0.000016

1232853 rows × 6 columns

In [32]:
# training dataset of unique points (only frequent points)
seg_train_df
Out[32]:
x y count bearing total_count count_pct
u_id
14 2.641643e+06 4.578215e+06 638 90.000000 6322772 0.010091
29 2.641644e+06 4.578214e+06 576 90.000000 6322772 0.009110
34 2.641644e+06 4.578215e+06 632 90.000000 6322772 0.009996
97 2.641644e+06 4.578215e+06 131 90.000000 6322772 0.002072
169 2.641645e+06 4.578215e+06 12 90.000000 6322772 0.000190
... ... ... ... ... ... ...
1230090 2.642330e+06 4.578149e+06 11 90.000000 6322772 0.000174
1230195 2.642330e+06 4.578149e+06 15 90.000000 6322772 0.000237
1230224 2.642330e+06 4.578155e+06 11 104.239496 6322772 0.000174
1230287 2.642330e+06 4.578149e+06 715 90.000000 6322772 0.011308
1231810 2.642333e+06 4.578148e+06 211 90.000000 6322772 0.003337

34997 rows × 6 columns

In [33]:
# infrequent data points excluded from training
seg_infre_df
Out[33]:
x y count bearing total_count count_pct
u_id
0 2.641642e+06 4.578215e+06 1 90.000000 6322772 0.000016
1 2.641643e+06 4.578215e+06 1 90.000000 6322772 0.000016
2 2.641643e+06 4.578215e+06 1 141.758255 6322772 0.000016
3 2.641643e+06 4.578215e+06 1 141.758255 6322772 0.000016
4 2.641643e+06 4.578216e+06 1 90.000000 6322772 0.000016
... ... ... ... ... ... ...
1232848 2.642337e+06 4.578158e+06 1 90.000000 6322772 0.000016
1232849 2.642337e+06 4.578156e+06 1 104.239498 6322772 0.000016
1232850 2.642337e+06 4.578157e+06 1 100.274225 6322772 0.000016
1232851 2.642337e+06 4.578158e+06 1 101.940815 6322772 0.000016
1232852 2.642337e+06 4.578158e+06 1 99.012488 6322772 0.000016

1197856 rows × 6 columns

In [34]:
# visual inspect - lat lon

dims = (10, 6)
fig, ax = plt.subplots(figsize=dims)

ax = sns.scatterplot(
    x='x',
    y='y',
    s=1,
    palette="black",
    # hue="count",
    # style="speed",
    marker='+',
    edgecolors='red',
    data=seg_train_df.copy())
In [35]:
# visual inspect - rasterize lat lon

# https://datashader.org/user_guide/Geography.html
# https://holoviews.org/reference/elements/bokeh/Tiles.html
## hv.element.tiles.tile_sources

from holoviews.element import tiles
from datashader.utils import lnglat_to_meters
# https://datashader.org/getting_started/Interactivity.html
from datashader.colors import Sets1to3
# https://github.com/holoviz/datashader/issues/767
import colorcet as cc
long_key = list(set(cc.glasbey_cool + cc.glasbey_warm + cc.glasbey_dark))

df = seg_train_df.copy()
#df['seg_cluster'] = df['seg_cluster'].apply(lambda x: 0 if x >=0 else -1)
points = hv.Points(df.copy())

hv.extension('bokeh')
hv.output(backend='bokeh')
#tiles.EsriImagery() * hd.datashade(points).opts(hv.opts(width=750, height=350))
#tiles.StamenTerrainRetina() * hd.datashade(points).opts(hv.opts(width=750, height=350))
#tiles.StamenTerrain() * 
tiles.CartoLight() * hd.datashade(points, 
             #aggregator=ds.count_cat('seg_cluster'), 
             #color_key=long_key
             ).opts(hv.opts(width=750, height=350))
#hd.dynspread(hd.datashade(points, 
#             aggregator=ds.count_cat('seg_cluster'), d
#             color_key=Sets1to3).opts(hv.opts(width=750, height=350)), threshold=0.4)
Out[35]:

HDBSCAN for Roadway Segment Clustering

In [36]:
# define subclustering parameters

seg_cluster_parameter = {    
    
    # x clusters of medium size segments
    'metric': 'euclidean',
    'min_cluster_size': 150,
    'min_samples': None,
    'cluster_selection_epsilon': 5
    
#     # 8 clusters of medium size segments
#     'metric': 'euclidean',
#     'min_cluster_size': 200,
#     'min_samples': None,
#     'cluster_selection_epsilon': 20

    # # 7 clusters of medium size segments
    #     'metric'='euclidean',
    #     'min_cluster_size'=300,
    #     'min_samples'=None,
    #     'cluster_selection_epsilon'=10

    # # 12 clusters of fine segments
    #     'metric'='euclidean',
    #     'min_cluster_size'=150,
    #     'min_samples'=None,
    #     'cluster_selection_epsilon'=5
}

# run subclustering for lanes
seg_train_df_1 = cluster_hdbscan(df=seg_train_df,
                               parameters=seg_cluster_parameter,
                               feature_names=['x', 'y'],
                               label_name='seg_cluster')
hdbscan trained on: {'metric': 'euclidean', 'min_cluster_size': 150, 'min_samples': None, 'cluster_selection_epsilon': 5}
In [37]:
len(seg_train_df_1['seg_cluster'].unique())
Out[37]:
13
In [38]:
# visual inspect clusters by facet plot

# https://seaborn.pydata.org/generated/seaborn.FacetGrid.html

g = sns.FacetGrid(seg_train_df_1, col='seg_cluster', col_wrap=5, height=4)
g = g.map(plt.scatter, 'x', 'y', s=0.1)#, edgecolor="w")

# note: cluster 3 and 8&9 are of interest, manually merge 8 and 9
In [39]:
### HDBSCAN parameter tuning decisions ###
# https://hdbscan.readthedocs.io/en/latest/parameter_selection.html
# min_samples
# opt A for decision for min_sample for core points
# ~ 400*600m^2 (240,000 m^2 area)
# ~ 1 mil unique points (745,709 if no lone points)
# avg density of points or minimum eligible density should be ~ 5 points
# opt B for decision for min_sample for core points
# net area is ~ (based on rough calculation of roadway areas)
# 50*600 + 30*400 + 4 * 10*400 = 58,000
# ~ 1 mil unique points (745,709 if no lone points)
# avg density of points or minimum eligible density should be ~ 15 points
# option A or B generates way too many clusters, gradully increase min_samples for core points until less clusters are generated

# cluster_selection_epsilon
# 1.5m radius (or 3.0m width) is approx. lane width, use 5m for a typ. 3 lane roadway

# HDBSCAN(algorithm='best', alpha=1.0, approx_min_span_tree=True,
#    gen_min_span_tree=False, leaf_size=40, memory=Memory(cachedir=None),
#    metric='euclidean', min_cluster_size=5, min_samples=None, p=None)
In [40]:
# prepare data for second-stage training
seg_train_df_2 = seg_train_df_1[seg_train_df_1['seg_cluster']==-1]
In [41]:
seg_train_df_2
Out[41]:
x y count bearing total_count count_pct seg_cluster
u_id
42172 2.641706e+06 4.578087e+06 19424 90.000000 6322772 0.307207 -1
60117 2.641712e+06 4.578104e+06 108 90.000000 6322772 0.001708 -1
63179 2.641713e+06 4.578104e+06 107 90.000000 6322772 0.001692 -1
68975 2.641715e+06 4.578089e+06 11 11.146112 6322772 0.000174 -1
69302 2.641715e+06 4.578089e+06 11 11.146112 6322772 0.000174 -1
... ... ... ... ... ... ... ...
1230090 2.642330e+06 4.578149e+06 11 90.000000 6322772 0.000174 -1
1230195 2.642330e+06 4.578149e+06 15 90.000000 6322772 0.000237 -1
1230224 2.642330e+06 4.578155e+06 11 104.239496 6322772 0.000174 -1
1230287 2.642330e+06 4.578149e+06 715 90.000000 6322772 0.011308 -1
1231810 2.642333e+06 4.578148e+06 211 90.000000 6322772 0.003337 -1

3205 rows × 7 columns

In [42]:
# 2nd stage dbscan clustering
# with more relax parameters on unclustered point from 1st stage only

# define subclustering parameters

seg_cluster_parameter = {
    
    # x clusters of medium size segments
    'metric': 'euclidean',
    'min_cluster_size': 75,
    'min_samples': None,
    'cluster_selection_epsilon': 5
}

# run subclustering for lanes
seg_train_df_2 = cluster_hdbscan(df=seg_train_df_2,
                               parameters=seg_cluster_parameter,
                               feature_names=['x', 'y'],
                               label_name='seg_cluster')
hdbscan trained on: {'metric': 'euclidean', 'min_cluster_size': 75, 'min_samples': None, 'cluster_selection_epsilon': 5}
In [43]:
# seg_train_df_2[seg_train_df_2['seg_cluster']==-1]

# clustered from stage 1
seg_a = seg_train_df_1[seg_train_df_1['seg_cluster'] != -1].copy()

# clustered from stage 2
seg_b = seg_train_df_2[seg_train_df_2['seg_cluster'] != -1].copy()
prev_max = seg_train_df_1[seg_train_df_1['seg_cluster']
                          != -1]['seg_cluster'].max()
seg_b['seg_cluster'] = seg_b['seg_cluster'] + \
    prev_max + 1  # increment cluster number

# unclustered
seg_c = seg_train_df_2[seg_train_df_2['seg_cluster'] == -1].copy()
In [44]:
# update training data
seg_train_df = pd.concat([seg_a,seg_b,seg_c])
In [45]:
# visual inspect clusters by facet plot

# https://seaborn.pydata.org/generated/seaborn.FacetGrid.html

g = sns.FacetGrid(seg_train_df_2, col='seg_cluster', col_wrap=5, height=4)
g = g.map(plt.scatter, 'x', 'y', s=0.1)#, edgecolor="w")

# cluster 6+12 = 18 is of interest
In [46]:
# visual inspect clusters by facet plot

# https://seaborn.pydata.org/generated/seaborn.FacetGrid.html

g = sns.FacetGrid(seg_train_df, col='seg_cluster', col_wrap=5, height=4)
g = g.map(plt.scatter, 'x', 'y', s=0.1)#, edgecolor="w")
In [47]:
# selecting only clusters of interests:
# -Cluster 8 + 9 (E-W road),
# -Cluster 3 (N-S road in the NW corner),
# -Cluster 18 (turning lane from South road to E-W road)

seg_train_df['seg_cluster_combined'] = seg_train_df['seg_cluster'].\
    apply(lambda x: 'A' if ((x == 'A') | (x == 8) | (x == 9))
          else ('B' if ((x == 'B') | (x == 3)) else
                'C' if ((x == 'C') | (x == 18))
                else 'Exclude'
                )
          )


# remove cluster to be excluded, assign combined cluster as final cluster

seg_train_df = seg_train_df[seg_train_df['seg_cluster_combined'].
                            isin(['A', 'B', 'C'])]
seg_train_df['seg_cluster'] = seg_train_df['seg_cluster_combined']
seg_train_df = seg_train_df.drop(columns=['seg_cluster_combined'])
seg_train_df.groupby(['seg_cluster']).count()
C:\Users\bwen\Anaconda3\envs\uas4t\lib\site-packages\ipykernel_launcher.py:19: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
Out[47]:
x y count bearing total_count count_pct
seg_cluster
A 16334 16334 16334 16334 16334 16334
B 2790 2790 2790 2790 2790 2790
C 696 696 696 696 696 696
In [48]:
# visual inspect clusters by map - color by clusters

# https://datashader.org/user_guide/Geography.html
# https://holoviews.org/reference/elements/bokeh/Tiles.html
## hv.element.tiles.tile_sources

from holoviews.element import tiles
from datashader.utils import lnglat_to_meters
# https://datashader.org/getting_started/Interactivity.html
from datashader.colors import Sets1to3
# https://github.com/holoviz/datashader/issues/767
import colorcet as cc
long_key = list(set(cc.glasbey_cool + cc.glasbey_warm + cc.glasbey_dark))

df = seg_train_df#[seg_train_df['seg_cluster']>=0].copy()
# df = seg_train_df[seg_train_df['seg_cluster']==0].copy()
points = hv.Points(df.copy())

hv.extension('bokeh')
hv.output(backend='bokeh')
#tiles.EsriImagery() * hd.datashade(points).opts(hv.opts(width=750, height=350))
#tiles.StamenTerrainRetina() * hd.datashade(points).opts(hv.opts(width=750, height=350))
#tiles.StamenTerrain() * 
tiles.CartoLight() * hd.datashade(points, 
             aggregator=ds.count_cat('seg_cluster'), 
             color_key=Sets1to3).opts(hv.opts(width=750, height=350))
#tiles.CartoLight() * hd.dynspread(hd.datashade(points, 
#             aggregator=ds.count_cat('seg_cluster'), 
#             color_key=long_key).opts(hv.opts(width=750, height=350)), threshold=0.4)
Out[48]:

Post-processing for un-clustered points

recover some nearby points not classified

un-clustered points

In [49]:
# reassignment part A for unclustered points

# approach #1
# - every point within an existing cluster is used as a core point for cluster reassignment
# - this approach require a lot more distance computations

seg_train_df_0 = seg_train_df[seg_train_df['seg_cluster'] == -1].\
    reset_index(drop=False).\
    rename(columns={'u_id': 'u_id'})
seg_train_df_1 = seg_train_df[seg_train_df['seg_cluster'] != -1].\
    reset_index(drop=False).\
    rename(columns={'u_id': 'u_id_clustered'})

seg_train_df_0 = seg_train_df_0.drop(columns=['seg_cluster'])
seg_train_df_1 = seg_train_df_1.\
    rename(columns={'x': 'x_clustered', 'y': 'y_clustered'}).\
    drop(columns=['count', 'bearing', 'total_count', 'count_pct'])

seg_train_df_0['tmp'] = 1
seg_train_df_1['tmp'] = 1
In [50]:
len(seg_train_df_0)
Out[50]:
0
In [51]:
len(seg_train_df_1)
Out[51]:
19820
In [52]:
# build intermediate dataframe
# https://stackoverflow.com/questions/35234012/python-pandas-merge-two-tables-without-keys-multiply-2-dataframes-with-broadc
seg_train_df_reassign_a = pd.merge(seg_train_df_0, seg_train_df_1, on=['tmp']).drop(columns='tmp')
In [53]:
# calculate Euclidean distance
# more resources for more complex examples: https://kanoki.org/2019/12/27/how-to-calculate-distance-in-python-and-pandas-using-scipy-spatial-and-distance-functions/
def e_dist(x1, x2, y1, y2):
    return np.sqrt((x1-x2) ** 2+(y1-y2) ** 2)


df = seg_train_df_reassign_a

df['dist'] = e_dist(
    x1=df['x_clustered'],
    x2=df['x'],
    y1=df['y_clustered'],
    y2=df['y'])

# get minimum distance in each group
idx = df.groupby(['u_id'])['dist'].transform(min) == df['dist']

# save results
seg_reassigned_df_a = df.copy()
seg_reassigned_idx_a = idx
In [54]:
# limit on reassigning unclustered points
reassign_dist_limit = 20 # meters

seg_unclustered_df = seg_reassigned_df_a[seg_reassigned_idx_a]
# limit max distance to 20 meters
seg_unclustered_df = seg_unclustered_df[seg_unclustered_df['dist'] < reassign_dist_limit]
seg_unclustered_df = seg_unclustered_df.set_index('u_id')
seg_unclustered_df = seg_unclustered_df[list(seg_train_df.columns)]
seg_unclustered_df
Out[54]:
x y count bearing total_count count_pct seg_cluster
u_id
In [55]:
len(seg_train_df)
Out[55]:
19820
In [56]:
seg_train_df_final = pd.concat(
    [seg_train_df[seg_train_df['seg_cluster'] != -1], seg_unclustered_df])

seg_train_df_final
Out[56]:
x y count bearing total_count count_pct seg_cluster
u_id
168654 2.641740e+06 4.578292e+06 15615 90.0 6322772 0.246964 B
172134 2.641741e+06 4.578293e+06 27 90.0 6322772 0.000427 B
174260 2.641741e+06 4.578294e+06 55 90.0 6322772 0.000870 B
183626 2.641743e+06 4.578278e+06 14 90.0 6322772 0.000221 B
183627 2.641743e+06 4.578279e+06 1119 90.0 6322772 0.017698 B
... ... ... ... ... ... ... ...
664308 2.641883e+06 4.578223e+06 224 90.0 6322772 0.003543 C
666456 2.641884e+06 4.578222e+06 262 90.0 6322772 0.004144 C
674803 2.641888e+06 4.578224e+06 13 90.0 6322772 0.000206 C
675050 2.641888e+06 4.578224e+06 298 90.0 6322772 0.004713 C
676427 2.641889e+06 4.578224e+06 197 90.0 6322772 0.003116 C

19820 rows × 7 columns

A quick look at convex hull with the clustered and unclusted points
In [57]:
# build convex hull to recapture raw gps points
# https://stackoverflow.com/questions/60194404/how-to-make-a-polygon-shapefile-which-corresponds-to-the-outer-boundary-of-the-g
# https://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.ConvexHull.html

# # scipy convex hull example

# from scipy.spatial import ConvexHull, convex_hull_plot_2d
# import matplotlib.pyplot as plt

# # hull 1
# points = np.random.rand(30, 2)   # 30 random points in 2-D
# hull = ConvexHull(points)
# plt.plot(points[:,0], points[:,1], 'o')
# for simplex in hull.simplices:
#     plt.plot(points[simplex, 0], points[simplex, 1], 'k-')
    
# # hull 2
# points = np.random.rand(30, 2)   # 30 random points in 2-D
# plt.plot(points[:,0], points[:,1], 'o')
# hull = ConvexHull(points)
# for simplex in hull.simplices:
#     plt.plot(points[simplex, 0], points[simplex, 1], 'k-')
In [58]:
# taking a look at convex hull without unclustered points
# this can be thought of as congested areas

df = seg_train_df[seg_train_df['seg_cluster'] != -1].\
    reset_index(drop=False).\
    rename(columns={'u_id': 'u_id_clustered'})

# build an dictionary of convex hull points
cluster_pt_dict = {}
for cluster in df['seg_cluster'].unique():
    cluster_pt_dict[cluster] = df[
        df['seg_cluster'] == cluster][['x', 'y']].to_numpy()


def get_convex_hull_indices(pts_array):
    hull = ConvexHull(pts_array)
    hull_indices = np.unique(hull.simplices.flat)
    hull_pts = pts_array[hull_indices, :]
    return(hull_pts)


# get convex hull
cluster_hull_dict = {}
for cluster in list(cluster_pt_dict.keys()):
    cluster_hull_dict[cluster] = get_convex_hull_indices(
        cluster_pt_dict[cluster])


# plot
for cluster in list(cluster_pt_dict.keys()):
    plt.plot(cluster_pt_dict[cluster][:, 0],
             cluster_pt_dict[cluster][:, 1], ',')
    hull = ConvexHull(cluster_pt_dict[cluster])
    for simplex in hull.simplices:
        plt.plot(cluster_pt_dict[cluster][simplex, 0],
                 cluster_pt_dict[cluster][simplex, 1], 'k-')
In [59]:
# taking a look at convex hull with unclustered points
# this can be viewed as extended congested areas

df = seg_train_df_final.copy()

# build an dictionary of convex hull points
cluster_pt_dict = {}
for cluster in df['seg_cluster'].unique():
    cluster_pt_dict[cluster] = df[
        df['seg_cluster'] == cluster][['x', 'y']].to_numpy()


def get_convex_hull_indices(pts_array):
    hull = ConvexHull(pts_array)
    hull_indices = np.unique(hull.simplices.flat)
    hull_pts = pts_array[hull_indices, :]
    return(hull_pts)


# get convex hull objects
cluster_hull_objs = {}
for cluster in list(cluster_pt_dict.keys()):
    cluster_hull_objs[cluster] = ConvexHull(cluster_pt_dict[cluster])

# get convex hull indice points
cluster_hull_dict = {}
for cluster in list(cluster_pt_dict.keys()):
    cluster_hull_dict[cluster] = get_convex_hull_indices(
        cluster_pt_dict[cluster])


# plot
for cluster in list(cluster_pt_dict.keys()):
    plt.plot(cluster_pt_dict[cluster][:, 0],
             cluster_pt_dict[cluster][:, 1], ',')
    hull = ConvexHull(cluster_pt_dict[cluster])
    for simplex in hull.simplices:
        plt.plot(cluster_pt_dict[cluster][simplex, 0],
                 cluster_pt_dict[cluster][simplex, 1], 'k-')
In [60]:
# build a convex hull points df from the entire training set (incl. unclustered points)

cluster_hull_list_df = []
for cluster in list(cluster_hull_dict.keys()):
    label = cluster
    df = pd.DataFrame(cluster_hull_dict[cluster], columns=['x', 'y'])
    df['seg_cluster'] = label
    cluster_hull_list_df.append(df)
    
cluster_hull_df = pd.concat(cluster_hull_list_df)

cluster_hull_df
Out[60]:
x y seg_cluster
0 2.641740e+06 4.578292e+06 B
1 2.641743e+06 4.578278e+06 B
2 2.641751e+06 4.578263e+06 B
3 2.641752e+06 4.578367e+06 B
4 2.641752e+06 4.578263e+06 B
5 2.641752e+06 4.578369e+06 B
6 2.641767e+06 4.578263e+06 B
7 2.641769e+06 4.578406e+06 B
8 2.641783e+06 4.578407e+06 B
9 2.641787e+06 4.578292e+06 B
10 2.641789e+06 4.578319e+06 B
0 2.641783e+06 4.578260e+06 A
1 2.641783e+06 4.578260e+06 A
2 2.641783e+06 4.578265e+06 A
3 2.641789e+06 4.578257e+06 A
4 2.641789e+06 4.578268e+06 A
5 2.641790e+06 4.578268e+06 A
6 2.641796e+06 4.578267e+06 A
7 2.641836e+06 4.578235e+06 A
8 2.641840e+06 4.578234e+06 A
9 2.641940e+06 4.578239e+06 A
10 2.642083e+06 4.578211e+06 A
11 2.642150e+06 4.578195e+06 A
12 2.642156e+06 4.578193e+06 A
13 2.642159e+06 4.578184e+06 A
14 2.642162e+06 4.578188e+06 A
15 2.642162e+06 4.578188e+06 A
0 2.641776e+06 4.578218e+06 C
1 2.641776e+06 4.578219e+06 C
2 2.641780e+06 4.578213e+06 C
3 2.641781e+06 4.578213e+06 C
4 2.641811e+06 4.578245e+06 C
5 2.641816e+06 4.578246e+06 C
6 2.641854e+06 4.578238e+06 C
7 2.641870e+06 4.578213e+06 C
8 2.641875e+06 4.578233e+06 C
9 2.641878e+06 4.578232e+06 C
10 2.641889e+06 4.578224e+06 C

Apply Road Segment to all unique data points

In [61]:
# https://stackoverflow.com/questions/16750618/whats-an-efficient-way-to-find-if-a-point-lies-in-the-convex-hull-of-a-point-cl/16898636#16898636

def in_hull(p, hull):
    """
    Test if points in `p` are in `hull`

    `p` should be a `NxK` coordinates of `N` points in `K` dimensions
    `hull` is either a scipy.spatial.Delaunay object or the `MxK` array of the 
    coordinates of `M` points in `K`dimensions for which Delaunay triangulation
    will be computed
    """
    from scipy.spatial import Delaunay
    if not isinstance(hull, Delaunay):
        hull = Delaunay(hull)

    return hull.find_simplex(p) >= 0
In [62]:
# iterate over convex hull objects and match points

cluster_hull_objs.keys()
Out[62]:
dict_keys(['B', 'A', 'C'])
In [63]:
df = parsed_df.copy()

df['x_id'] = df['x'] * 100
df['x_id'] = df['x_id'].astype(int)
df['y_id'] = df['y'] * 100
df['y_id'] = df['y_id'].astype(int)

# save ids to parsed_df
parsed_df = df.copy()

df['count'] = 1

# get count and angle by unique location
df = df.\
    groupby(['x', 'y', 'x_id', 'y_id']).\
    agg({"count": np.sum, 'bearing': np.median}).\
    rename(columns={'bearing': 'bearing_median'}).\
    reset_index()
In [64]:
all_cluster_cols = []
cluster_keys = list(cluster_hull_dict.keys())
cluster_keys.sort()

for cluster_hull in cluster_keys:
    col_name = "cluster_{}".format(str(cluster_hull))
    all_cluster_cols.append(col_name)
    df[col_name] = in_hull(
        p=df[['x', 'y']].to_numpy(),
        hull=cluster_hull_dict[cluster_hull])
    df.loc[df[col_name]==True, 'seg_cluster'] = str(cluster_hull)
    df = df.drop(columns=[col_name])
In [65]:
# merge id table with name table

# use all points - allow duplicate identicals
# clustered_df = parsed_df.merge(
#     df.drop(columns=['x', 'y']), on=['x_id', 'y_id'])

# use only unique points - disallow identicals

seg_train_df_final = df.copy()

seg_train_df_final['seg_cluster'] = seg_train_df_final['seg_cluster'].astype(str)

Lane and Directional Sub-Clustering

(Instead of Directional due to restricted scope in analysis area)

In [66]:
seg_train_df_final_bk = seg_train_df_final.copy()
In [67]:
seg_train_df_final = seg_train_df_final_bk.copy()

# filter out infreq points (points with less than 2 samples) for training
# this helps reduce data size and introduce breaks in low density areas of the data
seg_train_df_final = seg_train_df_final[seg_train_df_final['count'] > 2]
seg_train_df_infre = seg_train_df_final[seg_train_df_final['count'] <= 2]
In [68]:
cluster_list = seg_train_df_final['seg_cluster'].unique()
# cluster_list = [1]
cluster_list
Out[68]:
array(['nan', 'B', 'C', 'A'], dtype=object)
In [69]:
seg_train_df_final = seg_train_df_final[seg_train_df_final['seg_cluster']!='nan']

len(seg_train_df_final)
Out[69]:
182735
In [70]:
cluster_list = seg_train_df_final['seg_cluster'].unique()
# cluster_list = [1]
cluster_list
Out[70]:
array(['B', 'C', 'A'], dtype=object)
In [71]:
# prepare data

seg_train_df_final_dict = dict((key, seg_train_df_final[seg_train_df_final['seg_cluster'] == key])
                               for key in cluster_list)
In [72]:
# # run subclustering for direction - kmeans


# subcluster_parameters = {
#     'A': {
#         'n_clusters': 4
#     },
#     'B': {
#         'n_clusters': 1
#     },
#     'C': {
#         'n_clusters': 3
#     }
# }


# subcluster_results = dict((key,
#                            cluster_kmeans(df=seg_train_df_final_dict[key],
#                                           n_clusters=subcluster_parameters[key]['n_clusters'],
#                                           feature_names=['bearing_median'],
#                                           label_name='dir_cluster')
#                            )
#                           for key in cluster_list)
In [73]:
# # # run subclustering for direction - hdbscan


# # define subclustering parameters

# subcluster_parameters = {
#     'A': {
#         'metric': 'euclidean',
#         'min_cluster_size': 1000,
#         'min_samples': 100,
#         'cluster_selection_epsilon': 1
#     },
#     'B': {
#         'metric': 'euclidean',
#         'min_cluster_size': 1000,
#         'min_samples': 100,
#         'cluster_selection_epsilon': 1
#     },
#     'C': {
#         'metric': 'euclidean',
#         'min_cluster_size': 1000,
#         'min_samples': 100,
#         'cluster_selection_epsilon': 1
#     }
# }


# # run subclustering for lanes
# subcluster_results = dict((key,
#                            cluster_hdbscan(df=seg_train_df_final_dict[key],
#                                            parameters=subcluster_parameters[key],
#                                            feature_names=['x', 'y'],
#                                            label_name='dir_cluster')
#                            )
#                           for key in cluster_list)
In [74]:
lane_parameter = {
    'A': {
        'min_samples': 100,
        'cluster_selection_epsilon': 1
    },
    'B': {
        'min_samples': 100,
        'cluster_selection_epsilon': 1
    },
    'C': {
        'min_samples': 50,
        'cluster_selection_epsilon': 1
    }
}

subcluster_results = dict((key, cluster_dbscan(df=seg_train_df_final_dict[key],
                                               parameters=lane_parameter[key],
                                               feature_names=['x', 'y'],
                                               label_name='dir_cluster',
                                               verbose=False)) for key in cluster_list)
In [75]:
subcluster_results_df = pd.concat(list(subcluster_results.values()))

# # filter out "outliers" within the cluster
# subcluster_results_df = subcluster_results_df[subcluster_results_df['lane_subcluster']!=-1]

subcluster_results_df['seg_dir_cluster'] = subcluster_results_df['seg_cluster'].astype(
    str) + "_" + subcluster_results_df['dir_cluster'].astype(str)
In [76]:
len(subcluster_results_df)
Out[76]:
182735
In [77]:
min_cluster_size = 150 # for seg dir cluster, if not met, cluster is deleted

checksum = subcluster_results_df.groupby(['seg_dir_cluster']).count()
exclude = checksum[checksum<min_cluster_size].dropna().reset_index()

# exclude
subcluster_results_df = subcluster_results_df[~subcluster_results_df['seg_dir_cluster'].isin(
    exclude['seg_dir_cluster'])].copy()

exclude
Out[77]:
seg_dir_cluster x y x_id y_id count bearing_median seg_cluster dir_cluster
0 A_13 90.0 90.0 90.0 90.0 90.0 90.0 90.0 90.0
1 A_14 53.0 53.0 53.0 53.0 53.0 53.0 53.0 53.0
2 A_4 100.0 100.0 100.0 100.0 100.0 100.0 100.0 100.0
3 A_5 129.0 129.0 129.0 129.0 129.0 129.0 129.0 129.0
4 A_6 63.0 63.0 63.0 63.0 63.0 63.0 63.0 63.0
In [78]:
len(subcluster_results_df)
Out[78]:
182300
In [79]:
# taking a look at convex hull with directions
# this can be viewed as extended congested areas

df = subcluster_results_df.copy()

# build an dictionary of convex hull points
cluster_pt_dict = {}
for cluster in df['seg_dir_cluster'].unique():
    cluster_pt_dict[cluster] = df[
        df['seg_dir_cluster'] == cluster][['x', 'y']].to_numpy()


def get_convex_hull_indices(pts_array):
    hull = ConvexHull(pts_array)
    hull_indices = np.unique(hull.simplices.flat)
    hull_pts = pts_array[hull_indices, :]
    return(hull_pts)


# get convex hull objects
cluster_hull_objs = {}
for cluster in list(cluster_pt_dict.keys()):
    cluster_hull_objs[cluster] = ConvexHull(cluster_pt_dict[cluster])

# get convex hull indice points
cluster_hull_dict = {}
for cluster in list(cluster_pt_dict.keys()):
    cluster_hull_dict[cluster] = get_convex_hull_indices(
        cluster_pt_dict[cluster])


# plot
for cluster in list(cluster_pt_dict.keys()):
    plt.plot(cluster_pt_dict[cluster][:, 0],
             cluster_pt_dict[cluster][:, 1], ',')
    hull = ConvexHull(cluster_pt_dict[cluster])
    for simplex in hull.simplices:
        plt.plot(cluster_pt_dict[cluster][simplex, 0],
                 cluster_pt_dict[cluster][simplex, 1], 'k-')
In [80]:
# check size of clusters

subcluster_results_df.groupby('seg_dir_cluster').count()
Out[80]:
x y x_id y_id count bearing_median seg_cluster dir_cluster
seg_dir_cluster
A_-1 23095 23095 23095 23095 23095 23095 23095 23095
A_0 1674 1674 1674 1674 1674 1674 1674 1674
A_1 487 487 487 487 487 487 487 487
A_10 5555 5555 5555 5555 5555 5555 5555 5555
A_11 242 242 242 242 242 242 242 242
A_12 160 160 160 160 160 160 160 160
A_15 3076 3076 3076 3076 3076 3076 3076 3076
A_16 327 327 327 327 327 327 327 327
A_17 191 191 191 191 191 191 191 191
A_18 273 273 273 273 273 273 273 273
A_2 1524 1524 1524 1524 1524 1524 1524 1524
A_3 28014 28014 28014 28014 28014 28014 28014 28014
A_7 152 152 152 152 152 152 152 152
A_8 16651 16651 16651 16651 16651 16651 16651 16651
A_9 24434 24434 24434 24434 24434 24434 24434 24434
B_-1 12104 12104 12104 12104 12104 12104 12104 12104
B_0 13011 13011 13011 13011 13011 13011 13011 13011
B_1 199 199 199 199 199 199 199 199
B_2 8135 8135 8135 8135 8135 8135 8135 8135
B_3 281 281 281 281 281 281 281 281
B_4 244 244 244 244 244 244 244 244
B_5 2421 2421 2421 2421 2421 2421 2421 2421
B_6 12328 12328 12328 12328 12328 12328 12328 12328
C_-1 773 773 773 773 773 773 773 773
C_0 2101 2101 2101 2101 2101 2101 2101 2101
C_1 1740 1740 1740 1740 1740 1740 1740 1740
C_2 600 600 600 600 600 600 600 600
C_3 22508 22508 22508 22508 22508 22508 22508 22508
In [81]:
points.array()[0]
Out[81]:
array([2641740.31317523, 4578292.070285046, 15615, 90.0, 6322772,
       0.24696446432039618, 'B'], dtype=object)
In [82]:
# build seg_dir_lane_cluster from visual inspection
# this effectively clean up the clustering result and get rid of clusters that aren't meaningful

subcluster_results_df['seg_dir_lane_cluster'] = subcluster_results_df['seg_dir_cluster'].\
    apply(lambda x:
          'Green_1' if ((x == 'B_3') | (x == 'B_4') | (x == 'B_5') | (x == 'B_6'))
          else (
              'Green_2' if ((x == 'B_0'))
              else (
                  'Green_3' if ((x == 'B_1') | (x == 'B_2'))
                  else(
                      'Yellow_1' if ((x == 'C_0'))
                      else (
                          'Yellow_2' if ((x == 'C_1'))
                          else (
                              'Yellow_3' if ((x == 'C_2'))
                              else (
                                  'Red_1' if ((x == 'A_0') | (x == 'A_3'))
                                  else (
                                      'Red_2' if ((x == 'A_1') | (x == 'A_7') | (x == 'A_8') | (x == 'A_15'))
                                      else (
                                          'Red_3' if ((x == 'A_2') | (x == 'A_9'))
                                          else(
                                              'Exclude'
                                          )
                                      )
                                  )
                              )
                          )
                      )
                  )
              )
          ))

subcluster_results_df = subcluster_results_df[subcluster_results_df['seg_dir_lane_cluster']!='Exclude']
In [83]:
# visual inspect clusters by map - color by clusters

# https://datashader.org/user_guide/Geography.html
# https://holoviews.org/reference/elements/bokeh/Tiles.html
## hv.element.tiles.tile_sources

from holoviews.element import tiles
from datashader.utils import lnglat_to_meters
# https://datashader.org/getting_started/Interactivity.html
from datashader.colors import Sets1to3
# https://github.com/holoviz/datashader/issues/767
import colorcet as cc
long_key = list(set(cc.glasbey_cool + cc.glasbey_warm + cc.glasbey_dark))

df = subcluster_results_df#[seg_train_df['seg_cluster']>=0].copy()
# df = subcluster_results_df[subcluster_results_df['dir_cluster']>=0].copy()
# df = subcluster_results_df[subcluster_results_df['seg_dir_cluster'].isin(['A_0', 'A_1', 'A_2', 'A_3', 'A_4', 'A_5', 'A_6', 'A_7', 'A_8', 'A_9', 'A_10', 'A_11', 'A_12', 'A_13', 'A_14', 'A_15', 'A_16', 'A_17', 'A_18'])].copy()
# df = subcluster_results_df[subcluster_results_df['seg_dir_cluster'].isin(['A_0', 'A_3', 'A_10'])].copy()
# df = seg_train_df[seg_train_df['seg_cluster']==0].copy()
points = hv.Points(df.copy())

hv.extension('bokeh')
hv.output(backend='bokeh')
#tiles.EsriImagery() * hd.datashade(points).opts(hv.opts(width=750, height=350))
#tiles.StamenTerrainRetina() * hd.datashade(points).opts(hv.opts(width=750, height=350))
#tiles.StamenTerrain() * 

tiles.CartoLight() * hd.datashade(points, 
             aggregator=ds.count_cat('seg_dir_lane_cluster'), 
             color_key=long_key).opts(hv.opts(width=750, height=350)) #* color_points
#tiles.CartoLight() * hd.dynspread(hd.datashade(points, 
#             aggregator=ds.count_cat('seg_cluster'), 
#             color_key=long_key).opts(hv.opts(width=750, height=350)), threshold=0.4)
Out[83]:
In [84]:
# visual inspect clusters by facet plot

# https://seaborn.pydata.org/generated/seaborn.FacetGrid.html
# subcluster_results_df['tmp'] = 1
g = sns.FacetGrid(
    subcluster_results_df,
    hue='seg_dir_lane_cluster',
    col_wrap=5,
    height=4,
    legend_out=True,
    #     col='tmp'
    col='seg_dir_lane_cluster'
)
g = g.map(plt.scatter, 'x', 'y', s=0.05, marker='.')  # , edgecolor="w")

Megre results with full parsed data

In [85]:
# taking a look at convex hull with lane and directions
# this can be viewed as extended congested areas

df = subcluster_results_df.copy()

# build an dictionary of convex hull points
cluster_pt_dict = {}
for cluster in df['seg_dir_lane_cluster'].unique():
    cluster_pt_dict[cluster] = df[
        df['seg_dir_lane_cluster'] == cluster][['x', 'y']].to_numpy()


def get_convex_hull_indices(pts_array):
    hull = ConvexHull(pts_array)
    hull_indices = np.unique(hull.simplices.flat)
    hull_pts = pts_array[hull_indices, :]
    return(hull_pts)


# get convex hull objects
cluster_hull_objs = {}
for cluster in list(cluster_pt_dict.keys()):
    cluster_hull_objs[cluster] = ConvexHull(cluster_pt_dict[cluster])

# get convex hull indice points
cluster_hull_dict = {}
for cluster in list(cluster_pt_dict.keys()):
    cluster_hull_dict[cluster] = get_convex_hull_indices(
        cluster_pt_dict[cluster])


# plot
for cluster in list(cluster_pt_dict.keys()):
    plt.plot(cluster_pt_dict[cluster][:, 0],
             cluster_pt_dict[cluster][:, 1], ',')
    hull = ConvexHull(cluster_pt_dict[cluster])
    for simplex in hull.simplices:
        plt.plot(cluster_pt_dict[cluster][simplex, 0],
                 cluster_pt_dict[cluster][simplex, 1], 'k-')
In [86]:
# https://stackoverflow.com/questions/16750618/whats-an-efficient-way-to-find-if-a-point-lies-in-the-convex-hull-of-a-point-cl/16898636#16898636

def in_hull(p, hull):
    """
    Test if points in `p` are in `hull`

    `p` should be a `NxK` coordinates of `N` points in `K` dimensions
    `hull` is either a scipy.spatial.Delaunay object or the `MxK` array of the 
    coordinates of `M` points in `K`dimensions for which Delaunay triangulation
    will be computed
    """
    from scipy.spatial import Delaunay
    if not isinstance(hull, Delaunay):
        hull = Delaunay(hull)

    return hull.find_simplex(p) >= 0
In [87]:
# iterate over convex hull objects and match points

cluster_hull_objs.keys()
Out[87]:
dict_keys(['Green_2', 'Green_3', 'Green_1', 'Yellow_1', 'Yellow_2', 'Yellow_3', 'Red_1', 'Red_2', 'Red_3'])
In [88]:
df = parsed_df.copy()

df['x_id'] = df['x'] * 100
df['x_id'] = df['x_id'].astype(int)
df['y_id'] = df['y'] * 100
df['y_id'] = df['y_id'].astype(int)

# save ids to parsed_df
parsed_df = df.copy()

df['count'] = 1

# get count and angle by unique location
df = df.\
    groupby(['x', 'y', 'x_id', 'y_id']).\
    agg({"count": np.sum, 'bearing': np.median}).\
    rename(columns={'bearing': 'bearing_median'}).\
    reset_index()
In [89]:
all_cluster_cols = []
cluster_keys = list(cluster_hull_dict.keys())
cluster_keys.sort()

for cluster_hull in cluster_keys:
    col_name = "cluster_{}".format(str(cluster_hull))
    all_cluster_cols.append(col_name)
    df[col_name] = in_hull(
        p=df[['x', 'y']].to_numpy(),
        hull=cluster_hull_dict[cluster_hull])
    df.loc[df[col_name]==True, 'seg_dir_lane_cluster'] = str(cluster_hull)
    df = df.drop(columns=[col_name])
In [90]:
# merge id table with name table

# use all points - allow duplicate identicals
# clustered_df = parsed_df.merge(
#     df.drop(columns=['x', 'y']), on=['x_id', 'y_id'])

# use only unique points - disallow identicals

subcluster_results_df = df.copy()

subcluster_results_df['seg_dir_lane_cluster'] = subcluster_results_df['seg_dir_lane_cluster'].astype(str)
In [91]:
# remove nan

subcluster_results_df = subcluster_results_df[subcluster_results_df['seg_dir_lane_cluster']!='nan']

Merge seg_dir_lane_cluster with all applicable data points

In [92]:
df = subcluster_results_df.copy()

df['x_id'] = df['x'] * 100
df['x_id'] = df['x_id'].astype(int)
df['y_id'] = df['y'] * 100
df['y_id'] = df['y_id'].astype(int)


subcluster_results_df = df[['x_id', 'y_id', 'seg_dir_lane_cluster']].copy()
subcluster_results_df
Out[92]:
x_id y_id seg_dir_lane_cluster
232372 264175033 457826551 Green_2
233072 264175044 457826523 Green_2
233073 264175044 457826537 Green_2
233074 264175044 457826551 Green_2
233075 264175044 457826565 Green_2
... ... ... ...
1068784 264216176 457818811 Red_1
1068880 264216188 457818782 Red_1
1068881 264216188 457818796 Red_1
1068882 264216188 457818811 Red_1
1068983 264216199 457818782 Red_1

261555 rows × 3 columns

In [93]:
clustered_df = parsed_df.merge(
    subcluster_results_df, on=['x_id', 'y_id']).\
    sort_values(by='record_id', axis=0)  # make sure record is in order
clustered_df
Out[93]:
record_id track_id type traveled_d avg_speed lat lon speed lon_acc lat_acc ... appr_acc_angle x y x_lag y_lag bearing bearing_lead x_id y_id seg_dir_lane_cluster
0 73613 54 Car 466.01 14.462239 37.991838 23.731369 25.3502 -1.8672 -1.5358 ... -140.0 2.641764e+06 4.578273e+06 2.641764e+06 4.578272e+06 345.281004 NaN 264176391 457827286 Green_1
10 73614 54 Car 466.01 14.462239 37.991835 23.731370 25.0772 -1.9241 -1.6781 ... -140.0 2.641764e+06 4.578272e+06 2.641764e+06 4.578272e+06 338.493117 345.281004 264176402 457827243 Green_1
18 73615 54 Car 466.01 14.462239 37.991833 23.731371 24.7881 -2.0911 -1.7978 ... -140.0 2.641764e+06 4.578272e+06 2.641764e+06 4.578272e+06 345.281003 338.493117 264176413 457827215 Green_1
29 73616 54 Car 466.01 14.462239 37.991830 23.731372 24.4859 -2.1063 -1.8906 ... -140.0 2.641764e+06 4.578272e+06 2.641764e+06 4.578271e+06 345.281002 345.281003 264176424 457827173 Green_1
42 73617 54 Car 466.01 14.462239 37.991827 23.731373 24.1934 -1.9562 -1.9539 ... -140.0 2.641764e+06 4.578271e+06 2.641764e+06 4.578271e+06 0.000000 345.281002 264176435 457827130 Green_1
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
1680077 6322750 2305 Car 3.52 21.109554 37.992764 23.731498 22.6174 0.0000 0.0000 ... 0.0 2.641778e+06 4.578404e+06 2.641778e+06 4.578403e+06 21.506635 0.000000 264177827 457840365 Green_2
963926 6322751 2305 Car 3.52 21.109554 37.992762 23.731497 22.6174 0.0000 0.0000 ... 0.0 2.641778e+06 4.578403e+06 2.641778e+06 4.578403e+06 0.000000 21.506635 264177816 457840337 Green_2
963930 6322752 2305 Car 3.52 21.109554 37.992760 23.731497 22.6174 0.0000 0.0000 ... 0.0 2.641778e+06 4.578403e+06 2.641778e+06 4.578403e+06 0.000000 0.000000 264177816 457840309 Green_2
1860947 6322753 2305 Car 3.52 21.109554 37.992757 23.731497 22.6174 0.0000 0.0000 ... 0.0 2.641778e+06 4.578403e+06 2.641778e+06 4.578402e+06 21.506637 0.000000 264177816 457840267 Green_2
1664775 6322754 2305 Car 3.52 21.109554 37.992755 23.731496 22.6174 0.0000 0.0000 ... 0.0 2.641778e+06 4.578402e+06 NaN NaN 21.506637 21.506637 264177805 457840238 Green_2

1868799 rows × 22 columns

In [94]:
clustered_df.groupby(['seg_dir_lane_cluster']).count()
Out[94]:
record_id track_id type traveled_d avg_speed lat lon speed lon_acc lat_acc ... acc_angle appr_acc_angle x y x_lag y_lag bearing bearing_lead x_id y_id
seg_dir_lane_cluster
Green_1 218589 218589 218589 218589 218589 218589 218589 218589 218589 218589 ... 218589 218589 218589 218589 218559 218559 218589 218473 218589 218589
Green_2 131615 131615 131615 131615 131615 131615 131615 131615 131615 131615 ... 131615 131615 131615 131615 131606 131606 131615 131493 131615 131615
Green_3 76491 76491 76491 76491 76491 76491 76491 76491 76491 76491 ... 76491 76491 76491 76491 76486 76486 76491 76463 76491 76491
Red_1 521627 521627 521627 521627 521627 521627 521627 521627 521627 521627 ... 521627 521627 521627 521627 521594 521594 521627 521594 521627 521627
Red_2 502597 502597 502597 502597 502597 502597 502597 502597 502597 502597 ... 502597 502597 502597 502597 502569 502569 502597 502573 502597 502597
Red_3 365025 365025 365025 365025 365025 365025 365025 365025 365025 365025 ... 365025 365025 365025 365025 365001 365001 365025 365009 365025 365025
Yellow_1 25567 25567 25567 25567 25567 25567 25567 25567 25567 25567 ... 25567 25567 25567 25567 25567 25567 25567 25567 25567 25567
Yellow_2 19667 19667 19667 19667 19667 19667 19667 19667 19667 19667 ... 19667 19667 19667 19667 19666 19666 19667 19666 19667 19667
Yellow_3 7621 7621 7621 7621 7621 7621 7621 7621 7621 7621 ... 7621 7621 7621 7621 7621 7621 7621 7621 7621 7621

9 rows × 21 columns

In [95]:
########### End of Clustering Models for Roadway Geometries ###########

Calculating Congestion "Clusters" Results

In [96]:
# for each cluster, and time step
# cluster queues based on DBSCAN, set a avg speed eligibility ~ 10kph (no tailgating) - if a whole set of points are fast but close, ignore
# 
# find queue length based on furthest point algorithm function - these points are start and end of queues
# 

Prepare data

In [97]:
# define speed threshold
speed_threshold = 10
In [98]:
import math

# create time bin for the clustered df data

# calculate time bin based on max and min values, then do every x seconds

x_sec_bin = 0.02  # step size - shouldn't be too large, if 0.02, no bin

min_time = min(clustered_df['time'])
max_time = max(clustered_df['time'])

if(x_sec_bin <= 0.02):
    clustered_df['time_bin'] = clustered_df['time'].copy()
else:
    clustered_df['time_bin'] = x_sec_bin * \
        np.round(clustered_df['time']/x_sec_bin, 0)
In [99]:
# a quick analysis on the count by time bins

clustered_df['count'] = 1

cluster_time_df = clustered_df[clustered_df['speed'] < speed_threshold].\
    groupby(['seg_dir_lane_cluster', 'time_bin']).\
    agg({'count': np.sum}).\
    reset_index()

# cluster_time_df = cluster_time_df[cluster_time_df['count'] > 1]

# for testing, use whole seconds only
# wholoe_seconds_only = ~(cluster_time_df['time'].astype(int) < cluster_time_df['time'])
# cluster_time_df = cluster_time_df[wholoe_seconds_only]

# cluster_time_list = cluster_time_df.\
#     drop(columns=['count']).\
#     to_numpy()

# # len(cluster_time_list)
# cluster_time_df[cluster_time_df['seg_dir_lane_cluster'] == '7_3']  # ['count'].

max(cluster_time_df['count'])
Out[99]:
30
In [100]:
min(cluster_time_df['count']) # let's get rid of these groups, they won't have any queues
Out[100]:
1
In [101]:
len(clustered_df)
Out[101]:
1868799
In [102]:
clustered_df_eval = clustered_df.merge(cluster_time_df.rename(
    columns={'count': 'time_bin_count'}), on=['seg_dir_lane_cluster', 'time_bin'])

# exclude cluster and time points with no more than 1 sample
clustered_df_eval = clustered_df_eval[clustered_df_eval['time_bin_count'] > 1]

# exclude points that are moving faster than 10 kph
clustered_df_eval = clustered_df_eval[clustered_df_eval['speed'] < speed_threshold]
In [103]:
clustered_df_eval.groupby(['seg_dir_lane_cluster']).count()
Out[103]:
record_id track_id type traveled_d avg_speed lat lon speed lon_acc lat_acc ... y x_lag y_lag bearing bearing_lead x_id y_id time_bin count time_bin_count
seg_dir_lane_cluster
Green_1 119789 119789 119789 119789 119789 119789 119789 119789 119789 119789 ... 119789 119782 119782 119789 119782 119789 119789 119789 119789 119789
Green_2 53456 53456 53456 53456 53456 53456 53456 53456 53456 53456 ... 53456 53456 53456 53456 53455 53456 53456 53456 53456 53456
Green_3 26041 26041 26041 26041 26041 26041 26041 26041 26041 26041 ... 26041 26041 26041 26041 26041 26041 26041 26041 26041 26041
Red_1 317549 317549 317549 317549 317549 317549 317549 317549 317549 317549 ... 317549 317526 317526 317549 317533 317549 317549 317549 317549 317549
Red_2 380883 380883 380883 380883 380883 380883 380883 380883 380883 380883 ... 380883 380861 380861 380883 380867 380883 380883 380883 380883 380883
Red_3 213742 213742 213742 213742 213742 213742 213742 213742 213742 213742 ... 213742 213725 213725 213742 213739 213742 213742 213742 213742 213742
Yellow_1 10535 10535 10535 10535 10535 10535 10535 10535 10535 10535 ... 10535 10535 10535 10535 10535 10535 10535 10535 10535 10535
Yellow_2 7109 7109 7109 7109 7109 7109 7109 7109 7109 7109 ... 7109 7109 7109 7109 7109 7109 7109 7109 7109 7109
Yellow_3 3292 3292 3292 3292 3292 3292 3292 3292 3292 3292 ... 3292 3292 3292 3292 3292 3292 3292 3292 3292 3292

9 rows × 24 columns

In [104]:
# [x for i, x in df.groupby(level=0, sort=False)]

cluster_df_eval_list = [x for i, x in clustered_df_eval.groupby(['seg_dir_lane_cluster', 'time_bin'], sort=False)]

Test model parameter

In [105]:
cluster_df_eval_list[0]['time']
Out[105]:
5    0.0
8    0.0
Name: time, dtype: float64
In [106]:
# congestion_parameter = {
#     'metric': 'euclidean',
#     'min_cluster_size': 2,
#     'min_samples': 2,
#     'cluster_selection_epsilon': 15
# }

# df = cluster_hdbscan(df=cluster_df_eval_list[96],
#                      parameters=congestion_parameter,
#                      feature_names=['x', 'y'],
#                      label_name='cong_flag',
#                      verbose=False)

congestion_parameter = {
    'min_samples': 2,
    'cluster_selection_epsilon': 20
}

df = cluster_dbscan(df=cluster_df_eval_list[96],
                    parameters=congestion_parameter,
                    feature_names=['x', 'y'],
                    label_name='cong_flag')

df
dbscan trained on: {'min_samples': 2, 'cluster_selection_epsilon': 20}
Out[106]:
record_id track_id type traveled_d avg_speed lat lon speed lon_acc lat_acc ... y_lag bearing bearing_lead x_id y_id seg_dir_lane_cluster time_bin count time_bin_count cong_flag
2194 172900 112 Heavy Vehicle 251.30 5.905119 37.991740 23.731606 0.0000 0.0000 0.0000 ... 4.578259e+06 90.0 90.0 264179029 457825901 Red_2 68.36 1 18 0
2195 185709 116 Car 332.58 6.985240 37.991723 23.731680 0.0000 0.0000 0.0000 ... 4.578257e+06 90.0 90.0 264179853 457825661 Red_2 68.36 1 18 0
2196 189995 117 Heavy Vehicle 266.46 6.010437 37.991602 23.732460 0.0000 0.0000 0.0000 ... 4.578240e+06 90.0 90.0 264188536 457823952 Red_2 68.36 1 18 1
2197 193986 118 Car 272.78 5.141408 37.991585 23.732546 0.0000 -0.0000 -0.0000 ... 4.578237e+06 90.0 90.0 264189493 457823712 Red_2 68.36 1 18 1
2198 203309 121 Car 277.65 5.152253 37.991577 23.732608 0.0000 0.0000 0.0000 ... 4.578236e+06 90.0 90.0 264190183 457823599 Red_2 68.36 1 18 1
2199 216579 126 Car 289.64 5.303674 37.991567 23.732685 0.0000 0.0000 0.0000 ... 4.578235e+06 90.0 90.0 264191040 457823458 Red_2 68.36 1 18 1
2200 227411 128 Car 369.30 7.477298 37.991553 23.732745 0.0000 0.0000 0.0000 ... 4.578233e+06 90.0 90.0 264191708 457823260 Red_2 68.36 1 18 1
2201 235644 131 Car 377.92 7.634648 37.991545 23.732824 0.0000 0.0000 0.0000 ... 4.578231e+06 90.0 90.0 264192588 457823147 Red_2 68.36 1 18 1
2202 245128 135 Car 385.16 7.686098 37.991528 23.732882 0.0000 0.0000 0.0000 ... 4.578229e+06 90.0 90.0 264193233 457822907 Red_2 68.36 1 18 1
2203 260832 139 Car 413.98 5.688282 37.991517 23.732944 0.0000 0.0000 0.0000 ... 4.578228e+06 90.0 90.0 264193924 457822751 Red_2 68.36 1 18 1
2204 326167 159 Car 539.97 7.363294 37.991511 23.732987 0.0000 0.0000 0.0000 ... 4.578227e+06 90.0 90.0 264194402 457822667 Red_2 68.36 1 18 1
2205 339877 164 Car 501.63 6.440304 37.991509 23.733041 0.0000 0.0000 0.0000 ... 4.578226e+06 90.0 90.0 264195003 457822638 Red_2 68.36 1 18 1
2206 361316 173 Car 652.52 8.661803 37.991485 23.733180 3.7779 -0.8890 -0.3893 ... 4.578223e+06 90.0 90.0 264196551 457822299 Red_2 68.36 1 18 1
2207 814506 288 Car 580.10 7.533779 37.991504 23.733109 0.0000 0.0000 0.0000 ... 4.578226e+06 90.0 90.0 264195760 457822568 Red_2 68.36 1 18 1
2208 905806 313 Car 656.59 8.879508 37.991322 23.734230 0.0000 0.0000 0.0000 ... 4.578200e+06 90.0 90.0 264208239 457819997 Red_2 68.36 1 18 -1
2209 914974 316 Car 582.24 7.502003 37.991465 23.733301 2.2052 -0.1023 0.1054 ... 4.578220e+06 90.0 90.0 264197898 457822017 Red_2 68.36 1 18 1
2210 946937 324 Car 581.19 7.477742 37.991450 23.733368 3.9416 0.1485 0.1153 ... 4.578218e+06 180.0 90.0 264198644 457821805 Red_2 68.36 1 18 1
2211 985971 338 Motorcycle 655.79 13.568029 37.991581 23.732641 9.6861 0.6768 0.1242 ... 4.578237e+06 90.0 90.0 264190551 457823655 Red_2 68.36 1 18 1

18 rows × 26 columns

In [107]:
g = sns.FacetGrid(df, col='cong_flag', col_wrap=5, height=4)
g = g.map(plt.scatter, 'x', 'y', s=10, marker='.')#, edgecolor="w")

Run Congestion Clustering with HDBSCAN

In [108]:
# run clustering for congestion for lanes

# congestion_parameter = {
#     'metric': 'euclidean',
#     'min_cluster_size': 2,
#     'min_samples': 2,
#     'cluster_selection_epsilon': 15
# }

# cong_cluster_df_eval_list = [(cluster_hdbscan(df=df,
#                                               parameters=congestion_parameter,
#                                               feature_names=['x', 'y'],
#                                               label_name='cong_flag',
#                                               verbose=False)
#                               )
#                              for df in cluster_df_eval_list]


congestion_parameter = {
    'min_samples': 2,
    'cluster_selection_epsilon': 20
}

cong_cluster_df_eval_list = [(cluster_dbscan(df=df,
                                             parameters=congestion_parameter,
                                             feature_names=['x', 'y'],
                                             label_name='cong_flag',
                                             verbose=False)
                              )
                             for df in cluster_df_eval_list]
In [109]:
# visual checks

g = sns.FacetGrid(cong_cluster_df_eval_list[60], col='cong_flag', col_wrap=5, height=4)
g = g.map(plt.scatter, 'x', 'y', s=10, marker='.')#, edgecolor="w")
In [110]:
# combine results from clustering

cong_cluster_df_result = pd.concat(cong_cluster_df_eval_list)
In [111]:
# remove all outliers (not dense enough to qualify as queues)

cong_cluster_df_result = cong_cluster_df_result[cong_cluster_df_result['cong_flag'] != -1]
In [112]:
cong_cluster_df_result.groupby(['seg_dir_lane_cluster']).count()
Out[112]:
record_id track_id type traveled_d avg_speed lat lon speed lon_acc lat_acc ... x_lag y_lag bearing bearing_lead x_id y_id time_bin count time_bin_count cong_flag
seg_dir_lane_cluster
Green_1 119543 119543 119543 119543 119543 119543 119543 119543 119543 119543 ... 119537 119537 119543 119536 119543 119543 119543 119543 119543 119543
Green_2 53192 53192 53192 53192 53192 53192 53192 53192 53192 53192 ... 53192 53192 53192 53191 53192 53192 53192 53192 53192 53192
Green_3 25982 25982 25982 25982 25982 25982 25982 25982 25982 25982 ... 25982 25982 25982 25982 25982 25982 25982 25982 25982 25982
Red_1 309795 309795 309795 309795 309795 309795 309795 309795 309795 309795 ... 309779 309779 309795 309779 309795 309795 309795 309795 309795 309795
Red_2 377516 377516 377516 377516 377516 377516 377516 377516 377516 377516 ... 377495 377495 377516 377500 377516 377516 377516 377516 377516 377516
Red_3 199585 199585 199585 199585 199585 199585 199585 199585 199585 199585 ... 199573 199573 199585 199583 199585 199585 199585 199585 199585 199585
Yellow_1 10091 10091 10091 10091 10091 10091 10091 10091 10091 10091 ... 10091 10091 10091 10091 10091 10091 10091 10091 10091 10091
Yellow_2 7035 7035 7035 7035 7035 7035 7035 7035 7035 7035 ... 7035 7035 7035 7035 7035 7035 7035 7035 7035 7035
Yellow_3 3292 3292 3292 3292 3292 3292 3292 3292 3292 3292 ... 3292 3292 3292 3292 3292 3292 3292 3292 3292 3292

9 rows × 25 columns

In [113]:
cong_cluster_df_result
Out[113]:
record_id track_id type traveled_d avg_speed lat lon speed lon_acc lat_acc ... y_lag bearing bearing_lead x_id y_id seg_dir_lane_cluster time_bin count time_bin_count cong_flag
5 499233 214 Car 310.69 12.427559 37.992638 23.731501 0.0008 0.0022 -0.0000 ... 4.578386e+06 90.0 NaN 264177860 457838586 Green_1 0.00 1 2 0
8 553782 243 Taxi 551.56 15.809148 37.992585 23.731490 0.7859 -0.1303 0.0165 ... 4.578378e+06 90.0 NaN 264177738 457837837 Green_1 0.00 1 2 0
14 499234 214 Car 310.69 12.427559 37.992638 23.731501 0.0012 0.0035 -0.0000 ... 4.578386e+06 90.0 90.0 264177860 457838586 Green_1 0.04 1 2 0
16 553783 243 Taxi 551.56 15.809148 37.992585 23.731490 0.7790 0.0345 0.0172 ... 4.578378e+06 90.0 90.0 264177738 457837837 Green_1 0.04 1 2 0
22 499235 214 Car 310.69 12.427559 37.992638 23.731501 0.0018 0.0051 -0.0000 ... 4.578386e+06 90.0 90.0 264177860 457838586 Green_1 0.08 1 2 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
1712714 6318002 2269 Car 25.44 8.326656 37.992559 23.731484 1.9567 0.0019 0.0250 ... 4.578375e+06 0.0 90.0 264177671 457837470 Green_1 809.76 1 3 0
1712715 6318951 2273 Car 18.24 7.294751 37.992628 23.731498 0.0000 0.0000 0.0000 ... 4.578384e+06 90.0 90.0 264177827 457838444 Green_1 809.76 1 3 0
1712719 6316874 2265 Medium Vehicle 35.06 9.709735 37.992478 23.731462 7.4699 -0.0551 -0.0455 ... NaN 0.0 0.0 264177426 457836326 Green_1 809.80 1 3 0
1712720 6318003 2269 Car 25.44 8.326656 37.992558 23.731484 1.9580 -0.0085 0.0228 ... NaN 0.0 0.0 264177671 457837456 Green_1 809.80 1 3 0
1712721 6318952 2273 Car 18.24 7.294751 37.992628 23.731498 0.0000 0.0000 0.0000 ... NaN 90.0 90.0 264177827 457838444 Green_1 809.80 1 3 0

1106031 rows × 26 columns

In [114]:
# build intermediate dataframe
# https://stackoverflow.com/questions/35234012/python-pandas-merge-two-tables-without-keys-multiply-2-dataframes-with-broadc
# calculate Euclidean distance
# more resources for more complex examples: https://kanoki.org/2019/12/27/how-to-calculate-distance-in-python-and-pandas-using-scipy-spatial-and-distance-functions/
def e_dist(x1, x2, y1, y2):
    return np.sqrt((x1-x2) ** 2+(y1-y2) ** 2)


def getQueue(df):
    """This function requires the dataframe input to be groupped into appropriate clusters"""

    df1 = df.copy()

    df1['tmp'] = 1
    
    if len(df1) > 0:
        # put data in list

        df_dist = pd.merge(df1, df1, on=['tmp'], suffixes=(
            '_1', '_2'))

        df_dist['dist'] = e_dist(
            x1=df_dist['x_1'],
            x2=df_dist['x_2'],
            y1=df_dist['y_1'],
            y2=df_dist['y_2'])

        # get maximum distance in each group
    #     idx = df_dist.groupby(['cong_flag_1'])['dist'].transform(max) == df_dist['dist']
        idx = df_dist['dist'].max() == df_dist['dist']
        
        # keeping the first is good enough - idx will return 2 copy
        result = df_dist[idx].iloc[0]

        return result
In [115]:
# queue_calc_eval_list = [x for i, x in cong_cluster_df_result.groupby(
#     ['seg_dir_lane_cluster', 'time_bin', 'cong_flag'], sort=False)]
In [116]:
# # test
# getQueue(queue_calc_eval_list[0])
In [117]:
ct_queue_calc_result = cong_cluster_df_result.groupby(
    ['seg_dir_lane_cluster', 'time_bin', 'cong_flag']).apply(lambda grp: getQueue(grp))
In [118]:
ct_queue_calc_result_final = ct_queue_calc_result.\
    reset_index()\
    [['seg_dir_lane_cluster', 'time_bin', 'cong_flag', 'record_id_1',
        'record_id_2', 'lat_1', 'lon_1', 'lat_2', 'lon_2', 'dist']]

# for each row is a recorded queue
# where 
# seg_dir_lane_cluster is the road segment direction and lane cluster group
# time_bin is the time stamp
# cong_flag is the queue number
# record_id_1 record id of the track and time of the start of the queue
# lat_1 is the latitude of start of the queue
# lon_1 is the longitude of start of the queue
# record_id_2 record id of the track and time of the end of the queue
# lat_2 is the latitude of end of the queue
# lon_2 is the longitude of end of the queue
# dist is the queue length
In [119]:
# save all the queues
ct_queue_calc_result_final.to_csv('uas4t_tl_team_queue-revised.csv', index=False)

Report maximum queue for each cluster

In [120]:
# for each cluster, find the max queue length
# then report
## i. Maximum length of queue, 
## ii. Lane the maximum length occurred, 
## iii. Coordinates of the start and end of the maximum queue, 
## iv. Timestamp of the maximum queue occurrence, and v. whether, when and where a spillback is formed (when applicable).
In [121]:
ct_queue_calc_result_final
Out[121]:
seg_dir_lane_cluster time_bin cong_flag record_id_1 record_id_2 lat_1 lon_1 lat_2 lon_2 dist
0 Green_1 0.00 0 499233 553782 37.992638 23.731501 37.992585 23.731490 7.585852
1 Green_1 0.04 0 499234 553783 37.992638 23.731501 37.992585 23.731490 7.585852
2 Green_1 0.08 0 499235 553784 37.992638 23.731501 37.992585 23.731490 7.585852
3 Green_1 0.12 0 499236 553785 37.992638 23.731501 37.992585 23.731490 7.585852
4 Green_1 0.16 0 499237 553786 37.992638 23.731501 37.992584 23.731490 7.725285
... ... ... ... ... ... ... ... ... ... ...
180895 Yellow_3 786.16 0 6239568 6244817 37.991486 23.731718 37.991468 23.731659 7.042795
180896 Yellow_3 786.20 0 6239569 6244818 37.991486 23.731719 37.991468 23.731660 7.042795
180897 Yellow_3 786.24 0 6239570 6244819 37.991487 23.731720 37.991468 23.731660 7.198182
180898 Yellow_3 786.28 0 6239571 6244820 37.991487 23.731721 37.991468 23.731661 7.198182
180899 Yellow_3 786.32 0 6239572 6244821 37.991488 23.731722 37.991469 23.731661 7.301593

180900 rows × 10 columns

In [122]:
# max queue by cluster
max_dist = ct_queue_calc_result_final.\
    groupby(['seg_dir_lane_cluster']).\
    agg({'dist': np.max}).\
    rename(columns={'dist': 'max_queue_length'}).\
    reset_index()

max_queue_df = ct_queue_calc_result_final.merge(max_dist, on='seg_dir_lane_cluster')
max_queue_df = max_queue_df[max_queue_df['max_queue_length'] == max_queue_df['dist']]

max_queue_df.to_csv('uas4t_tl_team_results-revised.csv', index=False)
In [123]:
list(max_queue_df.columns)

# for each row is a recorded max queue per cluster over one or more time interval
# where 
# seg_dir_lane_cluster is the road segment direction cluster group
# time_bin is the time stamp
# cong_flag is the queue number
# record_id_1 record id of the track and time of the start of the queue
# lat_1 is the latitude of start of the queue
# lon_1 is the longitude of start of the queue
# record_id_2 record id of the track and time of the end of the queue
# lat_2 is the latitude of end of the queue
# lon_2 is the longitude of end of the queue
# max_queue_length is the maxmimum queue length (equals to dist, aka queue length)
Out[123]:
['seg_dir_lane_cluster',
 'time_bin',
 'cong_flag',
 'record_id_1',
 'record_id_2',
 'lat_1',
 'lon_1',
 'lat_2',
 'lon_2',
 'dist',
 'max_queue_length']

End of Notebook

In [ ]: